/* ************************************************************************************************ */
/* This code cleans the various data issues in TRACE Standard File                                  */
/* Logic of the cleaning procedure largely follows the discussion of Dick-Nielsen (2009) and (2014) */
/* Cleaning procedure takes care of Cancellation, Correction, Reversal and Double Counting          */
/* Author: Qingyi (Freda) Song Drechsler                                                            */
/* Date:   Written and Tested in October 2017                                                       */
/* ************************************************************************************************ */

libname fisd  ('/wrds/mergent/sasdata/fisd' '/wrds/mergent/sasdata/naic');
libname trace  '/wrds/trace/sasdata/standard';
libname tracee '/wrds/trace/sasdata/enhanced';
libname comp '/wrds/comp/sasdata/naa';
libname mylib  '<ADD OUTPUT PATH ON WRDS SERVER HERE>';

/* **************************************** */
/* Step 1: Reassign Volume and Other Values */
/* **************************************** */
*** Num of Obs (Starting Sample): 124,625,471;

data _clean1;
set trace.trace;
where cusip_id ne '';

* Convert Text Upper Volume Bound to Number;
if ASCII_RPTD_VOL_TX='5MM+' then ASCII_RPTD_VOL_TX='5000000';
if ASCII_RPTD_VOL_TX='1MM+' then ASCII_RPTD_VOL_TX='1000000';
ENTRD_VOL_QT=input(ASCII_RPTD_VOL_TX,8.); * convert to number;

* Convert Different TRC_ST values to be uniform;
if TRC_ST in ('G','M') then TRC_ST='T';
if TRC_ST in ('H','N') then TRC_ST='C';
if TRC_ST in ('I','O') then TRC_ST='W';
keep cusip_id bond_sym_id bsym trd_exctn_dt trd_exctn_tm msg_seq_nb trc_st wis_fl cmsn_trd 
     entrd_vol_qt rptd_pr yld_pt asof_cd side diss_rptg_side_cd orig_msg_seq_nb 
     orig_dis_dt rptg_party_type contra_party_type ;
rename DISS_RPTG_SIDE_CD=RPT_SIDE_CD;  
run; *** 124,578,019;



data _c; set _clean1; where trc_st in ('C'); run; ***   1,756,035;
data _w; set _clean1; where trc_st in ('W'); run; ***   1,526,718;
data _t; set _clean1; where trc_st in ('T'); run; *** 121,295,266;

/* ************************************* */
/* Step 2: Remove Cancellation Cases (C) */
/* ************************************* */

* Match Same-day cancellation using:
* Cusip_ID, Exectution Date and Time, Quantity, Price;
* C records show ORIG_MSG_SEQ_NB matching the Trade Record MSG_SEQ_NB;
proc sql;
 create table _clean2 as select distinct a.*, b.trc_st as trc_st_c
 from _T as a left join _C as b
 on a.cusip_id = b.cusip_id
 and a.trd_exctn_dt = b.trd_exctn_dt
 and a.trd_exctn_tm = b.trd_exctn_tm
 and a.rptd_pr = b.rptd_pr
 and a.entrd_vol_qt = b.entrd_vol_qt
 and a.msg_seq_nb = b.orig_msg_seq_nb;
quit;

data _del_c; set _clean2; where trc_st_c='C'; run; *** 1,715,600;
data _clean2; set _clean2; where trc_st_c =''; drop trc_st_c; run; *** 119,579,659;

********************************* 
* Summary of Cleaning of Step 2 *
* Starting Obs: 124,578,019     *
* Ending Obs:   119,579,659     *
* Pct Cleaned:  4.0%            *
*********************************;

/* *********************************** */
/* Step 3: Remove Correction Cases (W) */
/* *********************************** */

* NOTE: on a given day, a bond can have more than one round of correction
* One W to correct an older W, which then corrects the original T
* Before joining back to the T data, first need to clean out the W to handle the situation described above;
* The following section handles the chain of W cases;

* 3.1 Sort out all msg_seq_nb;
data __w_msg; set _w; keep cusip_id bond_sym_id trd_exctn_dt trd_exctn_tm msg_seq_nb flag;
flag='msg'; run;

* Sort out all mapped original msg_seq_nb;
data __w_omsg; set _w; keep cusip_id bond_sym_id trd_exctn_dt trd_exctn_tm orig_msg_seq_nb flag;
flag='omsg';
rename orig_msg_seq_nb = msg_seq_nb;
run;

data __w; set __w_omsg __w_msg; run;

* 3.2 Count the number of appearance (napp) of a msg_seq_nb: 
* If appears more than once then it is part of later correction;
proc sql;
 create table __w_napp as select distinct cusip_id, bond_sym_id, trd_exctn_dt, trd_exctn_tm, msg_seq_nb, count(*) as napp
 from __w group by cusip_id, bond_sym_id, trd_exctn_dt, trd_exctn_tm, msg_seq_nb;
quit;

* 3.3 Check whether one msg_seq_nb is associated with both msg and orig_msg or only to orig_msg;
* If msg_seq_nb appearing more than once is associated with only orig_msg - 
* It means that more than one msg_seq_nb is linked to the same orig_msg_seq_nb for correction. 
* Examples: cusip_id='362320AX1' and trd_Exctn_dt='04FEB2005'd (3 cases like this in total)
* If ntype=2 then a msg_seq_nb is associated with being both msg_seq_nb and orig_msg_seq_nb;
proc sql;
 create table __w_mult as select distinct cusip_id, bond_sym_id, trd_exctn_dt, trd_exctn_tm, msg_seq_nb, flag from __w; quit;
proc sql;
 create table __w_mult1 as select distinct cusip_id, bond_sym_id, trd_exctn_dt, trd_exctn_tm, msg_seq_nb, count(*) as ntype 
 from __w_mult group by cusip_id, trd_exctn_dt, trd_exctn_tm, msg_seq_nb;
quit;

* 3.4 Combine the npair and ntype info;
proc sql;
 create table __w_comb as select distinct a.*, b.ntype
 from __w_napp as a left join __w_mult1 as b
 on a.cusip_id=b.cusip_id and a.trd_exctn_dt=b.trd_exctn_dt and a.trd_exctn_tm=b.trd_exctn_tm and a.msg_seq_nb=b.msg_seq_nb
 order by a.cusip_id, a.trd_exctn_dt, a.trd_exctn_tm;
quit;

* Map back by matching CUSIP Excution Date and Time to remove msg_seq_nb that appears more than once;
* If napp=1 or (napp>1 but ntype=1);
proc sql;
 create table __w_keep as select distinct a.*, b.flag
 from __w_comb (where=(napp=1 or (napp>1 and ntype=1))) as a inner join __w as b
 on a.cusip_id=b.cusip_id and a.trd_exctn_dt=b.trd_exctn_dt and a.trd_exctn_tm=b.trd_exctn_tm
 and a.msg_seq_nb=b.msg_seq_nb
 order by a.cusip_id, a.trd_exctn_dt, a.trd_exctn_tm;
quit;

*3.5 Caluclate no of pair of records;
proc sql; 
 create table __w_keep as select distinct *, count(*)/2 as npair 
 from __w_keep group by cusip_id, trd_exctn_dt, trd_exctn_tm
 order by cusip_id, trd_exctn_dt, trd_exctn_tm;
quit;

* For records with only one pair of entry at a given time stamp - transpose using the flag information;
proc transpose data = __w_keep out= __w_keep1 ;
where npair=1;
by cusip_id trd_exctn_dt trd_exctn_tm;
var msg_seq_nb;
id flag;
run;

data __w_keep1; set __w_keep1; drop _name_ _label_; rename msg=msg_seq_nb omsg=orig_msg_seq_nb; run;

* For records with more than one pair of entry at a given time stamp - join back the original msg_seq_nb;
proc sql;
 create table __w_keep2 as select distinct a.cusip_id, a.bond_sym_id, a.trd_exctn_dt, a.trd_exctn_tm, a.msg_seq_nb, b.orig_msg_seq_nb
 from __w_keep (where=(flag='msg' and npair>1)) as a left join _w as b
 on a.cusip_id=b.cusip_id and a.trd_exctn_dt=b.trd_exctn_dt and a.trd_exctn_tm=b.trd_exctn_tm 
 and a.msg_seq_nb=b.msg_seq_nb;
quit;

data __w_clean; set __w_keep1 __w_keep2; drop bond_sym_id; run;

* 3.6 Join back to get all the other information;
proc sql;
 create table _w_clean as select distinct a.*, b.*
 from __w_clean as a left join _w (drop=orig_msg_seq_nb) as b
 on a.cusip_id=b.cusip_id and a.trd_exctn_dt = b.trd_exctn_dt and a.trd_exctn_tm = b.trd_exctn_tm and a.msg_seq_nb = b.msg_seq_nb;
quit;


/* 3.7 Match up with Trade Record data to delete the matched T record */;
* Matching by Cusip_ID, Date, and MSG_SEQ_NB;
* W records show ORIG_MSG_SEQ_NB matching orignal record MSG_SEQ_NB;
proc sql;
 create table _clean3 as select distinct a.*, 
 b.trc_st as trc_st_w, b.msg_seq_nb as mod_msg_seq_nb, b.orig_msg_seq_nb as mod_orig_msg_seq_nb
 from _clean2 as a left join _w_clean as b
 on a.cusip_id = b.cusip_id
 and a.trd_exctn_dt = b.trd_exctn_dt
 and a.msg_seq_nb = b.orig_msg_seq_nb;
quit; ***  119,579,665;

data _del_w; set _clean3; where trc_st_w = 'W'; run; *** 1,428,830;

* Delete matched T records;
data _clean3; set _clean3; where trc_st_w =''; drop trc_st_w MOD_msg_seq_nb MOD_orig_msg_seq_nb; run; ***   118,150,835;
 
* Replace T records with corresponding W records;
* Filter out W records with valid matching T from the previous step;
proc sql;
 create table _rep_w as select distinct a.*, b.trc_st_w, b.MOD_msg_seq_nb, b.MOD_orig_msg_seq_nb
 from _w_clean as a left join _del_w as b
 on a.cusip_id = b.cusip_id
 and a.trd_exctn_dt = b.trd_exctn_dt
 and a.msg_seq_nb = b.mod_msg_seq_nb;
quit;

data _rep_w; set _rep_w; where trc_st_w = 'W'; drop trc_st_w MOD_orig_msg_seq_nb Mod_msg_seq_nb; run; *** 1,428,745;

proc sort data = _rep_w out=_rep_w nodupkey;
    by cusip_id trd_exctn_dt msg_seq_nb orig_msg_seq_nb rptd_pr entrd_vol_qt;
run; ***  1,428,741;

* Combine the cleaned T records and correct replacement W records;
data _clean4; set _clean3 _rep_w; run; ***  119,579,576;

********************************* 
* Summary of Cleaning of Step 3 *
* Starting Obs: 119,579,659     *
* Ending Obs:   119,579,576     *
* Pct Cleaned:  0.0%            *
*********************************;

/* ***************************** */
/* Step 4: Remove Reversal Cases */
/* ***************************** */
data _rev_header; set _clean4; where asof_cd ='R'; 
    keep cusip_id bond_sym_id trd_exctn_dt trd_exctn_tm entrd_vol_qt rptd_pr rpt_side_cd contra_party_type;
run; ***  804,594;

* Match by only 6 keys: CUSIP_ID, Execution Date, Vol, Price, B/S and C/D (remove the time dimension);
proc sort data = _rev_header; 
 by cusip_id bond_sym_id trd_exctn_dt entrd_vol_qt rptd_pr rpt_side_cd contra_party_type trd_exctn_tm;
run;

data _rev_header6; set _rev_header; 
    seq+1;
    by cusip_id bond_sym_id trd_exctn_dt entrd_vol_qt rptd_pr rpt_side_cd contra_party_type;
    if first.contra_party_type then seq=1;
run;

* Create the same ordering among the non-reversal records;
* Remove records that are R (reversal) D (Delayed dissemination) and X (delayed reversal);
data _clean5; set _clean4; where asof_cd not in ('R', 'X', 'D'); run; ***  118,694,374;

data _clean5_header; set _clean5; 
    keep cusip_id bond_sym_id trd_exctn_dt trd_exctn_tm entrd_vol_qt rptd_pr rpt_side_cd contra_party_type msg_seq_nb;
run;


* Match by 6 keys (excluding execution time);
proc sort data = _clean5_header;
  by cusip_id bond_sym_id trd_exctn_dt entrd_vol_qt rptd_pr rpt_side_cd contra_party_type trd_exctn_tm msg_seq_nb;
run;

data _clean5_header; set _clean5_header; 
    seq6+1;
    by cusip_id bond_sym_id trd_exctn_dt entrd_vol_qt rptd_pr rpt_side_cd contra_party_type;
    if first.contra_party_type then seq6=1;
run;

* Join Reversal with Non-Reversal to delete the corresponding ones;
proc sql;
 create table _clean5_header as select distinct a.*, b.seq as rev_seq6
 from _clean5_header as a 
 left join _rev_header6 as b
 on a.cusip_id      = b.cusip_id 
 and a.trd_exctn_dt = b.trd_exctn_dt
 and a.entrd_vol_qt = b.entrd_vol_qt
 and a.rptd_pr      = b.rptd_pr
 and a.rpt_side_cd  = b.rpt_side_cd
 and a.contra_party_type  = b.contra_party_type
 and a.seq6         = b.seq;
quit;

data _rev_matched6; set _clean5_header; where rev_seq6 ne .; run;
*** If matching using 6 keys    763,835 out of 804,594 reversal records were matched (95%);

* As 6 key matching has a higher record of finding reversal match, use the 6 keys results now;
data _clean5_header; set _clean5_header;
    where rev_seq6=.;
    drop rev_seq6;
run; ***  117,930,539;

proc sql;
 create table _clean6 as select distinct a.*
 from _clean5 as a , _clean5_header as b
 where a.cusip_id=b.cusip_id
 and a.trd_exctn_dt = b.trd_exctn_dt
 and a.trd_exctn_tm = b.trd_exctn_tm
 and a.entrd_vol_qt = b.entrd_vol_qt
 and a.rptd_pr      = b.rptd_pr
 and a.rpt_side_cd  = b.rpt_side_cd
 and a.contra_party_type  = b.contra_party_type
 and a.msg_seq_nb   = b.msg_seq_nb;
quit; ***  117,930,539;

********************************* 
* Summary of Cleaning of Step 4 *
* Starting Obs: 119,579,576     *
* Ending Obs:   117,930,539     *
* Pct Cleaned:  1.4%            *
*********************************;


/* ******************************** */
/* Step 5: Clean Agency Transaction */
/* ******************************** */

* New variables available starting in June 2015: SIDE Contra_party_type Rptg_party_type;
data _clean6; set _clean6; rpt_side_cd = COALESCEC(rpt_side_cd, side); run;

* 3.1 Remove trades double reported by both buy and sell of the inter-dealer trade;
data _agency_s; set _clean6;
where rpt_side_cd = 'S' and contra_party_type = 'D';
run; 

data _agency_b; set _clean6;
where rpt_side_cd = 'B' and contra_party_type = 'D';
run; *** 0 Hence no need to adjust for agency transaction in the standard feed;

data mylib.trace_standard_clean; set _clean6; run;

********************************* 
* Summary of Entire Procedure   *
* Starting Obs: 124,625,471     *
* Ending Obs:   117,930,539     *
* Pct Cleaned:  5.4%            *
*********************************;

/* House Clean */;
proc datasets;
delete _agency: _c _clean: _del: _rep: _rev: _t _w _w: __:; 
quit;
run;